Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
Load the dataset
df = pd.read_csv("XYZloan_default_selected_vars.csv")
df.head()
| | Unnamed: 0.1 | Unnamed: 0 | id | loan_default | AP001 | AP002 | AP003 | AP004 | AP005 | AP006 | ... | CD162 | CD164 | CD166 | CD167 | CD169 | CD170 | CD172 | CD173 | MB005 | MB007 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 1 | 1 | 31 | 2 | 1 | 12 | 2017/7/6 10:21 | ios | ... | 13.0 | 13.0 | 0.0 | 0.0 | 1449.0 | 1449.0 | 2249.0 | 2249.0 | 7.0 | IPHONE7 |
| 1 | 1 | 2 | 2 | 0 | 27 | 1 | 1 | 12 | 2017/4/6 12:51 | h5 | ... | -99.0 | -99.0 | -99.0 | -99.0 | -99.0 | -99.0 | -99.0 | -99.0 | NaN | WEB |
| 2 | 2 | 3 | 3 | 0 | 33 | 1 | 4 | 12 | 2017/7/1 14:11 | h5 | ... | 3.0 | 2.0 | 33.0 | 0.0 | 33.0 | 0.0 | 143.0 | 110.0 | 8.0 | WEB |
| 3 | 3 | 4 | 4 | 0 | 34 | 2 | 4 | 12 | 2017/7/7 10:10 | android | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 10.0 | OPPO |
| 4 | 4 | 5 | 5 | 0 | 47 | 2 | 1 | 12 | 2017/7/6 14:37 | h5 | ... | -99.0 | -99.0 | -99.0 | -99.0 | -99.0 | -99.0 | -99.0 | -99.0 | NaN | WEB |
5 rows × 89 columns
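The first two columns, `Unnamed: 0.1` and `Unnamed: 0`, look like index columns left over from earlier CSV round-trips. A minimal sketch for dropping them, assuming they carry nothing beyond the row order (not applied in this run, so later outputs still include them):
# Drop leftover index columns if present; errors='ignore' makes this safe to re-run
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')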
Summary statistics for the dataset
print(df.shape)
(80000, 89)
print(df.describe())
Unnamed: 0.1 Unnamed: 0 id loan_default AP001 \
count 80000.000000 80000.000000 80000.000000 80000.000000 80000.000000
mean 39999.500000 40000.500000 40000.500000 0.193600 31.706913
std 23094.155105 23094.155105 23094.155105 0.395121 7.075070
min 0.000000 1.000000 1.000000 0.000000 20.000000
25% 19999.750000 20000.750000 20000.750000 0.000000 27.000000
50% 39999.500000 40000.500000 40000.500000 0.000000 30.000000
75% 59999.250000 60000.250000 60000.250000 0.000000 35.000000
max 79999.000000 80000.000000 80000.000000 1.000000 56.000000
AP002 AP003 AP004 AP007 AP008 \
count 80000.000000 80000.000000 80000.000000 80000.00000 80000.000000
mean 1.321813 2.014925 11.235413 3.30130 3.117200
std 0.467174 1.196806 2.212313 1.33655 1.306335
min 1.000000 1.000000 3.000000 1.00000 1.000000
25% 1.000000 1.000000 12.000000 2.00000 2.000000
50% 1.000000 1.000000 12.000000 3.00000 3.000000
75% 2.000000 3.000000 12.000000 5.00000 4.000000
max 2.000000 6.000000 12.000000 5.00000 5.000000
... CD160 CD162 CD164 CD166 \
count ... 79619.000000 79619.000000 79619.000000 79619.000000
mean ... 6.911956 14.271694 11.773358 909.089313
std ... 28.007499 38.235012 33.270641 1379.553332
min ... -99.000000 -99.000000 -99.000000 -99.000000
25% ... 2.000000 5.000000 4.000000 84.000000
50% ... 7.000000 12.000000 10.000000 475.000000
75% ... 14.000000 23.000000 20.000000 1209.000000
max ... 1061.000000 2792.000000 1579.000000 48585.000000
CD167 CD169 CD170 CD172 CD173 \
count 79619.000000 79619.000000 79619.00000 79619.000000 79619.000000
mean 810.786219 1732.693314 1539.33443 2513.226491 2229.606137
std 1245.044602 2441.503517 2172.71384 3404.975112 3005.615048
min -99.000000 -99.000000 -99.00000 -99.000000 -99.000000
25% 34.000000 309.000000 226.00000 539.000000 414.000000
50% 397.000000 1023.000000 870.00000 1553.000000 1324.000000
75% 1080.000000 2287.000000 2030.00000 3296.000000 2936.000000
max 29664.000000 88364.000000 54651.00000 125352.000000 87312.000000
MB005
count 77207.000000
mean 5.976272
std 3.641814
min 0.000000
25% 3.000000
50% 5.000000
75% 8.000000
max 47.000000
[8 rows x 86 columns]
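Several CD columns have a minimum of -99 in the summary above, and the `head()` output shows whole rows of -99 values; this looks like a sentinel for "no data" rather than a real measurement. That reading is an assumption, but if it holds, converting the sentinels to NaN keeps them from distorting means and correlations. A minimal sketch (not applied here, so the analysis below still uses the raw values; `df_clean` is an illustrative name):
# Assumption: -99 encodes missing data; replace it with NaN in the affected numeric columns
sentinel_cols = [c for c in df.select_dtypes('number').columns if (df[c] == -99).any()]
df_clean = df.copy()
df_clean[sentinel_cols] = df_clean[sentinel_cols].replace(-99, np.nan)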
# Check for missing values
print(df.isnull().sum())
Unnamed: 0.1 0
Unnamed: 0 0
id 0
loan_default 0
AP001 0
...
CD170 381
CD172 381
CD173 381
MB005 2793
MB007 0
Length: 89, dtype: int64
Distribution of the target variable `loan_default`
sns.countplot(x='loan_default', data=df)
plt.title('Distribution of loan_default')
plt.show()
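The `describe()` output above puts the mean of `loan_default` at about 0.194, so roughly one in five loans defaults. A quick numeric check to accompany the plot:
# Class shares: roughly 80.6% non-default (0) vs 19.4% default (1)
print(df['loan_default'].value_counts(normalize=True))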
# Selected variables for EDA
features = ['AP001', 'AP003', 'AP008',
'CR009', 'CR015', 'CR019',
'PA022', 'PA023', 'PA029',
'TD001', 'TD005', 'TD006', 'TD009', 'TD010', 'TD014',
'loan_default']
# Create a subset of the DataFrame with selected variables
df_subset = df[features]
Pairwise Scatterplot Matrix:
Patterns:
- Positive correlation: If the points tend to form an upward-sloping pattern, it suggests a positive correlation between the variables. When one variable increases, the other tends to increase as well.
- Negative correlation: If the points tend to form a downward-sloping pattern, it suggests a negative correlation. When one variable increases, the other tends to decrease.
- No correlation: If the points are scattered with no clear pattern, it suggests little to no correlation between the variables.
# Pairwise scatterplot matrix to visualize relationships
sns.pairplot(df_subset, hue='loan_default', markers=["o", "s"])
plt.show()
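With 80,000 rows, rendering the full pairplot can be slow. One workaround, sketched here, is to plot a random sample; the sample size and seed are arbitrary choices:
# Pairplot on a random sample to keep rendering time manageable
sample = df_subset.sample(n=5000, random_state=1)  # 5,000 rows is an arbitrary choice
sns.pairplot(sample, hue='loan_default', markers=["o", "s"])
plt.show()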
Box Plots:
- Box plots summarize the distribution of a variable, making it possible to compare how its values differ across the two levels of 'loan_default' (0 or 1).
# Box plots to compare variable distributions by loan_default
plt.figure(figsize=(16, 12))
num_subplots = len(features) - 1  # Exclude the target itself
rows = 4  # Number of rows for subplots
cols = num_subplots // rows + 1  # 15 plots fit in a 4 x 4 grid
for i, var in enumerate(features[:-1]):
    plt.subplot(rows, cols, i + 1)
    sns.boxplot(x='loan_default', y=var, data=df_subset)
    plt.title(f'{var} vs loan_default')
plt.tight_layout()
plt.show()
Correlation Matrix:
- The correlation matrix displays numerical values representing the correlation between each pair of variables.
- Values range from -1 (perfect negative correlation) to 1 (perfect positive correlation), with 0 indicating no correlation.
# Calculate the correlation matrix
corr_matrix = df_subset.corr()
# Set a threshold below which correlations are masked out
threshold = 0.05
# Mask the upper triangle and any correlations weaker than the threshold
mask = np.triu(np.ones_like(corr_matrix, dtype=bool)) | (corr_matrix.abs() < threshold)
# Create a custom diverging colormap for the heatmap
cmap = sns.diverging_palette(220, 20, as_cmap=True)
# Plot the correlation matrix; only correlations at or above the threshold remain visible
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap=cmap, mask=mask, fmt='.2f', linewidths=.5, cbar_kws={"shrink": 0.8})
plt.title(f'Correlation Matrix (Threshold = {threshold})')
plt.show()
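The heatmap masks weak correlations visually; the same threshold can be applied programmatically to list the stronger pairs. A minimal sketch (the `strong` name is illustrative):
# Keep the strict lower triangle, then list pairs whose |correlation| meets the threshold
pairs = corr_matrix.where(np.tril(np.ones_like(corr_matrix, dtype=bool), k=-1)).stack()
strong = pairs[pairs.abs() >= threshold].sort_values(key=abs, ascending=False)
print(strong)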
Line Plot: Features vs. loan_default
for v in features[:-1]:  # Exclude the target; plotting loan_default against itself is meaningless
    sns.lineplot(data=df, x=v, y='loan_default')
    plt.xlabel(v)
    plt.ylabel('loan_default')
    plt.show()
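`sns.lineplot` on a binary target effectively draws the mean default rate at each x value, which gets noisy for wide-ranging variables such as CR009. An alternative, sketched here, is to bin the variable and plot the default rate per bin (the variable and bin count are arbitrary choices):
# Default rate by decile bin of one variable
v = 'CR009'
bins = pd.qcut(df[v], q=10, duplicates='drop')
df.groupby(bins)['loan_default'].mean().plot(kind='bar')
plt.ylabel('Default rate')
plt.title(f'Default rate by {v} decile')
plt.show()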
- AP001 (YR_AGE): Older applicants are less likely to default. There is a clear downward trend: as age increases, the default rate decreases.
- AP003 (Code_education): Applicants with higher education levels are markedly less likely to default; the plot shows a strong downward trend.
- AP008 (Flag_IP_City_Not_Appl_City): Higher values, which may indicate a greater discrepancy between the applicant's IP-address city and the city on the application, are associated with a higher default rate; the plot trends upward.
- CR009 (AMT_LOAN_TOTAL): No clear trend; the default rate does not appear to vary meaningfully with the applicant's total existing loan amount.
- CR015 (MONTH_CREDIT_CARD_MOB_MAX): Applicants with around 2 to 3 months of credit card activity show a higher default rate, which declines for longer activity histories (over 3 months).
- CR019 (SCORE_SINGLE_DEBIT_CARD_LIMIT): Applicants with higher credit-limit scores are less likely to default; the plot shows a negative relationship.
- PA022 (DAYS_BTW_APPLICATION_AND_FIRST_COLLECTION_OR_HIGH_RISK_CALL): No clear trend emerges, so the relationship with default is hard to assess.
- PA023 (DAYS_BTW_APPLICATION_AND_FIRST_COLLECTION_CALL): As with PA022, no clear trend.
- PA029 (Average Length of Collection or High-Risk Inbound Calls): No distinct trend, making a direct relationship with default hard to establish.
- TD001 (Total Count of Queries in the Last 7 Days for Person-to-Person Transactions): More queries in the last 7 days are associated with a higher default rate; the trend is upward.
- TD005 (Total Count of Queries in the Last 1 Month for Person-to-Person Transactions): Likewise, a higher query count over the last month correlates with a higher default rate.
- TD006 (Total Count of Queries in the Last 1 Month for Small Loan): Upward trend; more small-loan queries go with a higher default rate.
- TD009 (Total Count of Queries in the Last 3 Months for Person-to-Person Transactions): Upward trend; a higher query count corresponds to a greater likelihood of default.
- TD010 (Total Count of Queries in the Last 3 Months for Small Loan): Similarly, more queries are associated with a higher default rate.
- TD014 (Total Count of Queries in the Last 6 Months for Small Loan): Upward trend; more queries over the last 6 months go with a higher default rate.
Impute missing values
missing_values = df_subset.isnull().sum()
missing_values
AP001             0
AP003             0
AP008             0
CR009             0
CR015             0
CR019             0
PA022           381
PA023           381
PA029           381
TD001             0
TD005             0
TD006             0
TD009             0
TD010             0
TD014             0
loan_default      0
dtype: int64
missing_values_per = (df_subset.isnull().sum() / df_subset.isnull().count())
missing_values_per
AP001           0.000000
AP003           0.000000
AP008           0.000000
CR009           0.000000
CR015           0.000000
CR019           0.000000
PA022           0.004763
PA023           0.004763
PA029           0.004763
TD001           0.000000
TD005           0.000000
TD006           0.000000
TD009           0.000000
TD010           0.000000
TD014           0.000000
loan_default    0.000000
dtype: float64
# Check for missing values
missing_values = df_subset.isnull().sum()
missing_values_per = (df_subset.isnull().sum() / df_subset.isnull().count())
pd.concat([missing_values, missing_values_per],axis=1, keys = ['Count_of_Missing','Percentage'])
| | Count_of_Missing | Percentage |
|---|---|---|
| AP001 | 0 | 0.000000 |
| AP003 | 0 | 0.000000 |
| AP008 | 0 | 0.000000 |
| CR009 | 0 | 0.000000 |
| CR015 | 0 | 0.000000 |
| CR019 | 0 | 0.000000 |
| PA022 | 381 | 0.004763 |
| PA023 | 381 | 0.004763 |
| PA029 | 381 | 0.004763 |
| TD001 | 0 | 0.000000 |
| TD005 | 0 | 0.000000 |
| TD006 | 0 | 0.000000 |
| TD009 | 0 | 0.000000 |
| TD010 | 0 | 0.000000 |
| TD014 | 0 | 0.000000 |
| loan_default | 0 | 0.000000 |
r = pd.concat([missing_values, missing_values_per],axis=1, keys = ['Count_of_Missing','Percentage']).reset_index()
r
| | index | Count_of_Missing | Percentage |
|---|---|---|---|
| 0 | AP001 | 0 | 0.000000 |
| 1 | AP003 | 0 | 0.000000 |
| 2 | AP008 | 0 | 0.000000 |
| 3 | CR009 | 0 | 0.000000 |
| 4 | CR015 | 0 | 0.000000 |
| 5 | CR019 | 0 | 0.000000 |
| 6 | PA022 | 381 | 0.004763 |
| 7 | PA023 | 381 | 0.004763 |
| 8 | PA029 | 381 | 0.004763 |
| 9 | TD001 | 0 | 0.000000 |
| 10 | TD005 | 0 | 0.000000 |
| 11 | TD006 | 0 | 0.000000 |
| 12 | TD009 | 0 | 0.000000 |
| 13 | TD010 | 0 | 0.000000 |
| 14 | TD014 | 0 | 0.000000 |
| 15 | loan_default | 0 | 0.000000 |
def null_cell(df_subset):
    df_subsetx = df_subset.copy()
    missing_values = df_subsetx.isnull().sum()
    missing_values_per = df_subsetx.isnull().sum() / df_subsetx.isnull().count()
    null_values = pd.concat([missing_values, missing_values_per], axis=1, keys=['the_null', 'the_null_perc'])
    null_values = null_values.sort_values('the_null', ascending=False)
    return null_values[null_values['the_null'] > 0]
fill_list = (null_cell(df_subset)).index
fill_list
Index(['PA022', 'PA023', 'PA029'], dtype='object')
# Example lookup of a row by name; no 'type' row exists here, so this returns an empty Series
r.loc[r['index'] == 'type', 'Percentage']
Series([], Name: Percentage, dtype: float64)
df_mean = df_subset.copy()
for col in fill_list:
    # Assign back instead of calling inplace fillna on a slice (avoids SettingWithCopyWarning)
    df_mean[col] = df_mean[col].fillna(df_mean[col].mean())
df_mean.isnull().sum()
AP001           0
AP003           0
AP008           0
CR009           0
CR015           0
CR019           0
PA022           0
PA023           0
PA029           0
TD001           0
TD005           0
TD006           0
TD009           0
TD010           0
TD014           0
loan_default    0
dtype: int64
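The same mean imputation can also be done with scikit-learn's `SimpleImputer`, which is convenient when the step needs to live inside a modeling pipeline. A minimal sketch (`df_mean2` is an illustrative name):
from sklearn.impute import SimpleImputer

# Mean-impute every column; equivalent to the loop above for the columns with NAs
imputer = SimpleImputer(strategy='mean')
df_mean2 = pd.DataFrame(imputer.fit_transform(df_subset), columns=df_subset.columns)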
# Basic WOE function
def WOE(var, df_mean):
    # 'Good' here is the sum of loan_default per bin (i.e., defaults) and 'Bad' the remainder,
    # following the labeling used throughout this notebook
    result = df_mean[[var, 'loan_default']].groupby(var)['loan_default'].agg(['count', 'sum']).reset_index()
    result.columns = [var, 'Count', 'Good']
    result['Bad'] = result['Count'] - result['Good']
    result['Good %'] = (result['Good'] / result['Good'].sum() * 100).round(2)
    result['Bad %'] = (result['Bad'] / result['Bad'].sum() * 100).round(2)
    # Bins where Good % or Bad % is 0 yield ±inf WOE (log of 0 / divide by zero); handled later
    result[var + '_WOE'] = np.log(result['Good %'] / result['Bad %']).round(2)
    result = result.sort_values(by=var + '_WOE')
    return result
woe_AP001 = WOE('AP001',df_mean)
woe_AP001.head()
| | AP001 | Count | Good | Bad | Good % | Bad % | AP001_WOE |
|---|---|---|---|---|---|---|---|
| 33 | 53 | 274 | 37 | 237 | 0.24 | 0.37 | -0.43 |
| 34 | 54 | 276 | 43 | 233 | 0.28 | 0.36 | -0.25 |
| 25 | 45 | 981 | 162 | 819 | 1.05 | 1.27 | -0.19 |
| 29 | 49 | 614 | 102 | 512 | 0.66 | 0.79 | -0.18 |
| 28 | 48 | 683 | 116 | 567 | 0.75 | 0.88 | -0.16 |
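A WOE table is often summarized with an Information Value (IV): IV = sum((Good% - Bad%) x WOE), with the percentages taken as fractions. It gives a one-number screen of a variable's predictive strength. A minimal sketch built on the table above (the `IV` helper is illustrative, and bins with ±inf WOE would need smoothing or merging first):
# Information Value from a WOE table produced by WOE()
def IV(woe_table, var):
    diff = (woe_table['Good %'] - woe_table['Bad %']) / 100  # percentages -> fractions
    return (diff * woe_table[var + '_WOE']).sum()

print(f"IV for AP001: {IV(woe_AP001, 'AP001'):.4f}")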
target_variables = ['AP001', 'AP003', 'AP008',
'CR009', 'CR015', 'CR019',
'PA022', 'PA023', 'PA029',
'TD001', 'TD005', 'TD006',
'TD009', 'TD010', 'TD014']
# WOE append function: merges each variable's WOE column onto the original DataFrame
def WOE_append(df_mean, target_variables):
    for var in target_variables:
        df_mean = df_mean.merge(WOE(var, df_mean)[[var, var + '_WOE']], on=var, how='left')
    return df_mean
# Call the WOE_append function to merge columns
df_woe = WOE_append(df_mean, target_variables)
# Print the final merged dataframe
df_woe.head()
/Users/jimac/opt/anaconda3/lib/python3.9/site-packages/pandas/core/arraylike.py:397: RuntimeWarning: divide by zero encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
(The same warning repeats for each variable whose WOE table contains a bin with a zero Good % or Bad %.)
| | AP001 | AP003 | AP008 | CR009 | CR015 | CR019 | PA022 | PA023 | PA029 | TD001 | ... | CR019_WOE | PA022_WOE | PA023_WOE | PA029_WOE | TD001_WOE | TD005_WOE | TD006_WOE | TD009_WOE | TD010_WOE | TD014_WOE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 31 | 1 | 3 | 35807 | 4 | 2 | -1.0 | -1.0 | -98.0 | 1 | ... | 0.17 | -0.15 | -0.12 | -0.13 | -0.20 | 0.05 | -0.11 | 0.01 | -0.20 | -0.09 |
| 1 | 27 | 1 | 4 | 7254 | 6 | 11 | -1.0 | -1.0 | -98.0 | 2 | ... | -0.19 | -0.15 | -0.12 | -0.13 | 0.02 | -0.03 | -0.11 | -0.24 | -0.20 | -0.09 |
| 2 | 33 | 4 | 2 | 30377 | 2 | 3 | -1.0 | -1.0 | -98.0 | 2 | ... | 0.13 | -0.15 | -0.12 | -0.13 | 0.02 | 0.05 | -0.11 | 0.01 | -0.20 | -0.26 |
| 3 | 34 | 4 | 5 | 0 | 5 | 8 | -1.0 | -1.0 | -98.0 | 0 | ... | -0.07 | -0.15 | -0.12 | -0.13 | -0.25 | -0.45 | -0.17 | -0.63 | -0.29 | -0.26 |
| 4 | 47 | 1 | 4 | 0 | 6 | 11 | -99.0 | -99.0 | -99.0 | 2 | ... | -0.19 | -0.30 | -0.30 | -0.30 | 0.02 | -0.20 | -0.11 | -0.39 | -0.20 | -0.26 |
5 rows × 31 columns
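The RuntimeWarnings above come from bins where one class count is zero, so `np.log` divides by zero and produces ±inf WOE values (visible later as `inf` entries in the WOE feature table). A common remedy, sketched here as a variant rather than a change to this pipeline, is additive smoothing of the counts (`WOE_smooth` and `eps` are illustrative):
# WOE with additive smoothing to avoid log(0) / divide-by-zero on empty bins
def WOE_smooth(var, df_mean, eps=0.5):
    result = df_mean[[var, 'loan_default']].groupby(var)['loan_default'].agg(['count', 'sum']).reset_index()
    result.columns = [var, 'Count', 'Good']
    result['Bad'] = result['Count'] - result['Good']
    good_pct = (result['Good'] + eps) / (result['Good'] + eps).sum()
    bad_pct = (result['Bad'] + eps) / (result['Bad'] + eps).sum()
    result[var + '_WOE'] = np.log(good_pct / bad_pct).round(2)
    return result.sort_values(by=var + '_WOE')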
Original Dataset (Without WOE Features)
# Recall Variables
features
['AP001', 'AP003', 'AP008', 'CR009', 'CR015', 'CR019', 'PA022', 'PA023', 'PA029', 'TD001', 'TD005', 'TD006', 'TD009', 'TD010', 'TD014', 'loan_default']
# `df_mean` already has its missing values imputed
df_mean.isnull().sum()
AP001           0
AP003           0
AP008           0
CR009           0
CR015           0
CR019           0
PA022           0
PA023           0
PA029           0
TD001           0
TD005           0
TD006           0
TD009           0
TD010           0
TD014           0
loan_default    0
dtype: int64
# Use the imputed DataFrame as the modeling table
model = df_mean
model.head()
| | AP001 | AP003 | AP008 | CR009 | CR015 | CR019 | PA022 | PA023 | PA029 | TD001 | TD005 | TD006 | TD009 | TD010 | TD014 | loan_default |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 31 | 1 | 3 | 35807 | 4 | 2 | -1.0 | -1.0 | -98.0 | 1 | 4 | 1 | 5 | 1 | 2 | 1 |
| 1 | 27 | 1 | 4 | 7254 | 6 | 11 | -1.0 | -1.0 | -98.0 | 2 | 3 | 1 | 3 | 1 | 2 | 0 |
| 2 | 33 | 4 | 2 | 30377 | 2 | 3 | -1.0 | -1.0 | -98.0 | 2 | 4 | 1 | 5 | 1 | 1 | 0 |
| 3 | 34 | 4 | 5 | 0 | 5 | 8 | -1.0 | -1.0 | -98.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 47 | 1 | 4 | 0 | 6 | 11 | -99.0 | -99.0 | -99.0 | 2 | 2 | 1 | 2 | 1 | 1 | 0 |
# Data Split
from sklearn.model_selection import train_test_split
X = model.drop('loan_default', axis = 1)
y = model['loan_default']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 1)
# Training decision tree model
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
DecisionTreeClassifier()
from sklearn.metrics import roc_curve, auc
# Model predictions
y_pred_prob = dtc.predict_proba(X_test)[:, 1]
# Calculate the ROC curve points
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Calculate the AUC (Area Under Curve)
roc_auc = auc(fpr, tpr)
print(f"Original Dataset ROC AUC Score:{roc_auc:.4f}")
Original Dataset ROC AUC Score:0.5244
# Plot the ROC curve
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
y_pred = dtc.predict(X_test)
from sklearn.metrics import accuracy_score
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Print accuracy score
print(f"Original Dataset Accuracy:{accuracy:.4f}")
Original Dataset Accuracy:0.6876
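A fully grown decision tree tends to memorize the training data, which fits the near-chance test AUC seen here. Two common countermeasures are limiting tree depth and cross-validating; a minimal sketch (`max_depth=5` is an arbitrary starting point, not a tuned value):
from sklearn.model_selection import cross_val_score

# Shallower tree to curb overfitting, scored with 5-fold cross-validated ROC AUC
dtc_shallow = DecisionTreeClassifier(max_depth=5, random_state=1)
scores = cross_val_score(dtc_shallow, X, y, cv=5, scoring='roc_auc')
print(f"Cross-validated ROC AUC: {scores.mean():.4f} (+/- {scores.std():.4f})")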
Dataset with WOE Features
# Recall Features
df_woe.columns
Index(['AP001', 'AP003', 'AP008', 'CR009', 'CR015', 'CR019', 'PA022', 'PA023',
'PA029', 'TD001', 'TD005', 'TD006', 'TD009', 'TD010', 'TD014',
'loan_default', 'AP001_WOE', 'AP003_WOE', 'AP008_WOE', 'CR009_WOE',
'CR015_WOE', 'CR019_WOE', 'PA022_WOE', 'PA023_WOE', 'PA029_WOE',
'TD001_WOE', 'TD005_WOE', 'TD006_WOE', 'TD009_WOE', 'TD010_WOE',
'TD014_WOE'],
dtype='object')
# Select the target plus the 15 WOE features (the last 16 columns)
model2 = df_woe.iloc[:,-16:]
model2
| | loan_default | AP001_WOE | AP003_WOE | AP008_WOE | CR009_WOE | CR015_WOE | CR019_WOE | PA022_WOE | PA023_WOE | PA029_WOE | TD001_WOE | TD005_WOE | TD006_WOE | TD009_WOE | TD010_WOE | TD014_WOE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.01 | 0.17 | 0.01 | inf | 0.15 | 0.17 | -0.15 | -0.12 | -0.13 | -0.20 | 0.05 | -0.11 | 0.01 | -0.20 | -0.09 |
| 1 | 0 | 0.11 | 0.17 | 0.10 | NaN | -0.27 | -0.19 | -0.15 | -0.12 | -0.13 | 0.02 | -0.03 | -0.11 | -0.24 | -0.20 | -0.09 |
| 2 | 0 | -0.03 | -0.51 | -0.09 | inf | -0.03 | 0.13 | -0.15 | -0.12 | -0.13 | 0.02 | 0.05 | -0.11 | 0.01 | -0.20 | -0.26 |
| 3 | 0 | -0.04 | -0.51 | 0.10 | -0.15 | 0.08 | -0.07 | -0.15 | -0.12 | -0.13 | -0.25 | -0.45 | -0.17 | -0.63 | -0.29 | -0.26 |
| 4 | 0 | -0.01 | 0.17 | 0.10 | -0.15 | -0.27 | -0.19 | -0.30 | -0.30 | -0.30 | 0.02 | -0.20 | -0.11 | -0.39 | -0.20 | -0.26 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 79995 | 0 | -0.08 | 0.17 | 0.01 | NaN | 0.08 | -0.10 | -0.15 | -0.12 | -0.13 | 0.02 | -0.20 | -0.17 | -0.11 | -0.29 | 0.07 |
| 79996 | 0 | 0.01 | -0.51 | 0.10 | 0.69 | 0.08 | 0.02 | -0.15 | -0.12 | -0.13 | -0.25 | -0.20 | -0.17 | -0.39 | -0.29 | -0.37 |
| 79997 | 0 | -0.13 | 0.17 | 0.01 | -0.15 | 0.32 | -0.10 | -0.15 | -0.12 | -0.13 | -0.20 | -0.51 | 0.09 | -0.61 | -0.02 | -0.09 |
| 79998 | 0 | -0.07 | 0.17 | -0.09 | 0.22 | 0.15 | 0.02 | -0.15 | -0.12 | -0.13 | -0.25 | -0.45 | -0.17 | -0.63 | -0.29 | -0.37 |
| 79999 | 1 | 0.07 | -0.13 | 0.01 | 0.18 | 0.08 | 0.02 | 0.11 | 0.42 | inf | 0.38 | 0.41 | 0.43 | 0.27 | 0.48 | 0.41 |
80000 rows × 16 columns
# Check NAs
model2.isnull().sum()
loan_default        0
AP001_WOE           0
AP003_WOE           0
AP008_WOE           0
CR009_WOE       21754
CR015_WOE           0
CR019_WOE           0
PA022_WOE          48
PA023_WOE          47
PA029_WOE        2570
TD001_WOE           2
TD005_WOE           0
TD006_WOE           1
TD009_WOE           3
TD010_WOE           3
TD014_WOE           7
dtype: int64
# Fix NAs: replace ±inf WOE values with NaN, then drop rows with any missing values
model2 = model2.replace([np.inf, -np.inf], np.nan).dropna()
model2.head()
| | loan_default | AP001_WOE | AP003_WOE | AP008_WOE | CR009_WOE | CR015_WOE | CR019_WOE | PA022_WOE | PA023_WOE | PA029_WOE | TD001_WOE | TD005_WOE | TD006_WOE | TD009_WOE | TD010_WOE | TD014_WOE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 0 | -0.04 | -0.51 | 0.10 | -0.15 | 0.08 | -0.07 | -0.15 | -0.12 | -0.13 | -0.25 | -0.45 | -0.17 | -0.63 | -0.29 | -0.26 |
| 4 | 0 | -0.01 | 0.17 | 0.10 | -0.15 | -0.27 | -0.19 | -0.30 | -0.30 | -0.30 | 0.02 | -0.20 | -0.11 | -0.39 | -0.20 | -0.26 |
| 6 | 1 | 0.07 | -0.13 | -0.17 | -0.15 | -0.03 | 0.17 | -0.15 | -0.12 | -0.13 | 0.02 | -0.20 | -0.17 | -0.39 | -0.29 | -0.26 |
| 7 | 0 | 0.01 | -0.13 | 0.10 | -0.15 | 0.15 | 0.13 | -0.15 | -0.12 | -0.13 | -0.20 | -0.51 | 0.09 | -0.61 | -0.02 | -0.09 |
| 8 | 0 | 0.07 | -0.13 | 0.10 | 0.06 | -0.27 | 0.12 | -0.15 | -0.12 | -0.13 | 0.25 | 0.30 | -0.11 | 0.45 | 0.32 | 0.26 |
# Data Split
from sklearn.model_selection import train_test_split
X = model2.drop('loan_default', axis = 1)
y = model2['loan_default']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 1)
# Training decision tree model
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
DecisionTreeClassifier()
from sklearn.metrics import roc_curve, auc
# Model predictions
y_pred_prob = dtc.predict_proba(X_test)[:, 1]
# Calculate the ROC curve points
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Calculate the AUC (Area Under Curve)
roc_auc = auc(fpr, tpr)
print(f"With WOE Features ROC AUC Score:{roc_auc:.4f}")
With WOE Features ROC AUC Score:0.5279
# Plot the ROC curve
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
y_pred = dtc.predict(X_test)
from sklearn.metrics import accuracy_score
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Print accuracy score
print(f"With WOE Features Accuracy:{accuracy:.4f}")
With WOE Features Accuracy:0.6983
On the original features, the decision tree reached a ROC AUC of 0.5244 and an accuracy of 68.76%, indicating only modest discriminative ability. With the WOE-transformed features, the ROC AUC rose to 0.5279 and accuracy to 69.83%. The gain is modest rather than dramatic, but it suggests the WOE transformation gives the model a slightly better ability to differentiate between the two classes.